In this problem, we will fully work up the data we studied in Tutorial 9b. You downloaded the data via Dropbox. We will consider all images starting with snaps001, snaps002, and snaps003. Be sure to read the README.pdf file from Jin Park describing the data set.
a) Segment all images using the RFP channel, and then compute the mean intensity of each bacterium in the CFP and YFP channels. Show a representative segmentation (one each for the 001, 002, and 003 sets) by overlaying your binary segmented image on a phase image.
import warnings
# Workhorse
import numpy as np
import pandas as pd
# Find files
import glob
import os
# A whole bunch of skimage stuff
import skimage.feature
import skimage.filter
import skimage.filter.rank
import skimage.io
import skimage.morphology
import skimage.restoration
import skimage.segmentation
import skimage.transform
# Regionprops_to_df
import sys
sys.path.insert(0, '/Users/chigozie/git/regionprops_to_df/')
import regionprops_to_df
# And some useful scipy.ndimage stuff
import scipy.ndimage
# Import plotting tools
import matplotlib.pyplot as plt
import seaborn as sns
# Interaction
import ipywidgets
# Magic function to make matplotlib inline; other style specs must come AFTER
%matplotlib inline
# This enables high res graphics inline (only use with static plots (non-Bokeh))
# SVG is preferred, but there is a bug in Jupyter with vertical lines
%config InlineBackend.figure_formats = {'png', 'retina'}
# JB's favorite Seaborn settings for notebooks
rc = {'lines.linewidth': 2,
'axes.labelsize': 18,
'axes.titlesize': 18,
'axes.facecolor': 'DFDFE5'}
sns.set_context('notebook', rc=rc)
sns.set_style('darkgrid', rc=rc)
# Suppress future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# The directory containing daytime data
data_dir = '../../bebi103/data/park_et_al/'
# Glob string for images
im_list = []
for i in range(1,4):
im_glob = os.path.join(data_dir, 'snaps00%s*.tif' % i)
im_list += glob.glob(im_glob)
# Let's look at the first 10 entries
im_list[:10]
# Make data frame containing images
df = pd.DataFrame(columns=['strain', 'field_of_view', 'channel', 'filepath', 'image', 'histogram'])
df.filepath = im_list
df.strain = [i[-13:-10] for i in df.filepath]
df.field_of_view = [i[-9:-6] for i in df.filepath]
df.channel = [i[-5] for i in df.filepath]
df.image = [skimage.io.imread(i) for i in df.filepath]
df.histogram = [skimage.exposure.histogram(im) for im in df.image]
# Take a look
df.head()
# Upsample p, c, y (2 means 2x as big, order=0 means no interpolation)
upsample_list = [scipy.ndimage.zoom(im, 2, order=0) for im in df.loc[df.channel!='r','image']]
df.loc[df.channel!='r','image'] = pd.Series(upsample_list, index=df[df.channel!='r'].index)
# So we have it, the interpixel distance
physical_size = 0.065 # microns
# Look at all the strain 001 images
with sns.axes_style('white'):
fig, ax = plt.subplots(len(df.index[(df.channel=='c') & (df.strain=='001')]), 4, figsize=(12,40))
ax[0,0].set_title('Phase')
ax[0,1].set_title('RFP (segment)')
ax[0,2].set_title('CFP')
ax[0,3].set_title('YFP')
for i, _ in enumerate(df.index[(df.channel=='c') & (df.strain=='001')]):
ax[i,0].imshow(df.loc[df.channel=='p','image'].values[i], cmap=plt.cm.viridis)
ax[i,1].imshow(df.loc[df.channel=='r','image'].values[i], cmap=plt.cm.viridis)
ax[i,2].imshow(df.loc[df.channel=='c','image'].values[i], cmap=plt.cm.viridis)
ax[i,3].imshow(df.loc[df.channel=='y','image'].values[i], cmap=plt.cm.viridis)
ax[i,0].set_ylabel(df.loc[df.channel=='c','filepath'].values[i][-13:-4])
#ax[1].imshow(df.image[0][subim], cmap=plt.cm.viridis)
# Zoom in
subim = np.s_[100:300, 850:1050]
# Look at all the strain 001 images
with sns.axes_style('white'):
fig, ax = plt.subplots(len(df.index[(df.channel=='c') & (df.strain=='001')]), 4, figsize=(12,40))
ax[0,0].set_title('Phase')
ax[0,1].set_title('RFP (segment)')
ax[0,2].set_title('CFP')
ax[0,3].set_title('YFP')
for i, _ in enumerate(df.index[(df.channel=='c') & (df.strain=='001')]):
ax[i,0].imshow(df.loc[df.channel=='p','image'].values[i][subim], cmap=plt.cm.viridis)
ax[i,1].imshow(df.loc[df.channel=='r','image'].values[i][subim], cmap=plt.cm.viridis)
ax[i,2].imshow(df.loc[df.channel=='c','image'].values[i][subim], cmap=plt.cm.viridis)
ax[i,3].imshow(df.loc[df.channel=='y','image'].values[i][subim], cmap=plt.cm.viridis)
ax[i,0].set_ylabel(df.loc[df.channel=='c','filepath'].values[i][-13:-4])
#ax[1].imshow(df.image[0][subim], cmap=plt.cm.viridis)
# Look at all the histograms for strain 001 images
with sns.axes_style('white'):
fig, ax = plt.subplots(len(df.index[(df.channel=='c') & (df.strain=='001')]), 4, figsize=(12,40))
ax[0,0].set_title('Phase')
ax[0,1].set_title('RFP (segment)')
ax[0,2].set_title('CFP')
ax[0,3].set_title('YFP')
for i, _ in enumerate(df.index[(df.channel=='c') & (df.strain=='001')]):
hist_p, bins_p = df.loc[df.channel=='p','histogram'].values[i]
hist_r, bins_r = df.loc[df.channel=='r','histogram'].values[i]
hist_c, bins_c = df.loc[df.channel=='c','histogram'].values[i]
hist_y, bins_y = df.loc[df.channel=='y','histogram'].values[i]
ax[i,0].set_ylabel(df.loc[df.channel=='c','filepath'].values[i][-13:-4])
ax[i,0].fill_between(bins_p, hist_p, cmap=plt.cm.viridis)
ax[i,1].fill_between(bins_r, hist_r, cmap=plt.cm.viridis)
ax[i,2].fill_between(bins_c, hist_c, cmap=plt.cm.viridis)
ax[i,3].fill_between(bins_y, hist_y, cmap=plt.cm.viridis)
# Make log scale
for i in ax:
for j in i:
j.set_yscale('log')
There are clearly some strange images here. For example, the phase of 006 is unimodal. Looking back at the image itself, we can see that there was inversion of the image. We need to deal with this somehow.
Let's try a very simple segmentation first. It looks from the histograms like values below about 500 in the RFP channel are background.
# Make threshold greater than 500
df.thresh = None
df.loc[df.channel=='r','thresh'] = [df.loc[(df.channel=='r'),'image'].values[i]>500 for i, _ in enumerate(df.loc[(df.channel=='r'),'image'])]
# Look at all the strain 001 images
with sns.axes_style('white'):
fig, ax = plt.subplots(len(df.index[(df.channel=='c') & (df.strain=='001')]), 2, figsize=(12,40))
ax[0,0].set_title('Threshold (>500)')
ax[0,1].set_title('RFP (segment)')
for i, _ in enumerate(df.index[(df.channel=='c') & (df.strain=='001')]):
ax[i,0].imshow(df.loc[df.channel=='r','thresh'].values[i][subim], cmap=plt.cm.viridis)
ax[i,1].imshow(df.loc[df.channel=='r','image'].values[i][subim], cmap=plt.cm.viridis)
ax[i,0].set_ylabel(df.loc[df.channel=='c','filepath'].values[i][-13:-4])
This is OK, but clearly not perfect. Instead, let's try the thresholding function from Tutorial 9:
# Reuse thresholding function from Tutorial 9
def bebi103_thresh(im, selem, white_true=True, k_range=(0.5, 1.5),
min_size=100):
"""
Threshold image as described above. Morphological mean filter is
applied using selem.
"""
# Determine comparison operator
if white_true:
compare = np.greater
sign = -1
else:
compare = np.less
sign = 1
# Do the mean filter
im_mean = skimage.filter.rank.mean(im, selem)
# Compute number of pixels in binary image as a function of k
k = np.linspace(k_range[0], k_range[1], 100)
n_pix = np.empty_like(k)
for i in range(len(k)):
n_pix[i] = compare(im, k[i] * im_mean).sum()
# Compute rough second derivative
dn_pix_dk2 = np.diff(np.diff(n_pix))
# Find index of maximal second derivative
max_ind = np.argmax(sign * dn_pix_dk2)
# Use this index to set k
k_opt = k[max_ind - sign * 2]
# Threshold with this k
im_bw = compare(im, k_opt * im_mean)
# Remove all the small objects
im_bw = skimage.morphology.remove_small_objects(im_bw, min_size=min_size)
return im_bw, k_opt, im_mean
# Threshold
df.thresh = None
selem = skimage.morphology.disk(10)
df.loc[df.channel=='r','thresh'] = [bebi103_thresh(df.loc[(df.channel=='r'),'image'].values[i], selem) for i, _ in enumerate(df.loc[(df.channel=='r'),'image'])]
# Look at all the strain 001 images
with sns.axes_style('white'):
fig, ax = plt.subplots(len(df.index[(df.channel=='c') & (df.strain=='001')]), 2, figsize=(12,40))
ax[0,0].set_title('Threshold (sophisticated)')
ax[0,1].set_title('RFP (segment)')
for i, _ in enumerate(df.index[(df.channel=='c') & (df.strain=='001')]):
ax[i,0].imshow(df.loc[df.channel=='r','thresh'].values[i][0][subim], cmap=plt.cm.viridis)
ax[i,1].imshow(df.loc[df.channel=='r','image'].values[i][subim], cmap=plt.cm.viridis)
ax[i,0].set_ylabel(df.loc[df.channel=='c','filepath'].values[i][-13:-4])
Pretty good! There are some small regions which are false positives, though. We should increase the "min_size" parameter.
# Threshold
df.thresh = None
selem = skimage.morphology.disk(5)
df.loc[df.channel=='r','thresh'] = [bebi103_thresh(df.loc[(df.channel=='r'),'image'].values[i], selem, min_size=200) for i, _ in enumerate(df.loc[(df.channel=='r'),'image'])]
# Look at all the strain 001 images
with sns.axes_style('white'):
fig, ax = plt.subplots(len(df.index[(df.channel=='c') & (df.strain=='001')]), 2, figsize=(12,80))
ax[0,0].set_title('Threshold (sophisticated)')
ax[0,1].set_title('RFP (segment)')
for i, _ in enumerate(df.index[(df.channel=='c') & (df.strain=='001')]):
ax[i,0].imshow(df.loc[df.channel=='r','thresh'].values[i][0][subim], cmap=plt.cm.viridis)
ax[i,1].imshow(df.loc[df.channel=='r','image'].values[i][subim], cmap=plt.cm.viridis)
ax[i,0].set_ylabel(df.loc[df.channel=='c','filepath'].values[i][-13:-4])
def zero_crossing_filter(im, thresh):
"""
Returns image with 1 if there is a zero crossing and 0 otherwise.
thresh is the the minimal value of the gradient, as computed by Sobel
filter, at crossing to count as a crossing.
"""
# Square structuring element
selem = skimage.morphology.square(3)
# Do max filter and min filter
im_max = scipy.ndimage.filters.maximum_filter(im, footprint=selem)
im_min = scipy.ndimage.filters.minimum_filter(im, footprint=selem)
# Compute gradients using Sobel filter
im_grad = skimage.filter.sobel(im)
# Return edges
return (((im >= 0) & (im_min < 0)) | ((im <= 0) & (im_max > 0))) \
& (im_grad >= thresh)
df.image.values[2].astype(float)
im_float = df.image.values[2].astype(float)
im_LoG = scipy.ndimage.filters.gaussian_laplace(im_float, 2.0)
plt.imshow(im_LoG[subim], cmap=plt.cm.RdBu_r)
def edgethresh(threshold):
im_edge = zero_crossing_filter(im_LoG, threshold)
plt.imshow(im_edge[subim], cmap=plt.cm.RdBu_r)
ipywidgets.interact(edgethresh, threshold=(0,10,0.001))
df['im_float'] = [im.astype(float) for im in df.image]
# Compute LoG
# im_LoG = scipy.ndimage.filters.gaussian_laplace(im_float, 2.0)
im_LoG_list = []
im_edges_list = []
for i, _ in enumerate(df.loc[(df.channel=='r'),'im_float']):
im_LoG = scipy.ndimage.filters.gaussian_laplace(df.loc[(df.channel=='r'),'im_float'].values[i], 2.0)
im_LoG_list.append(im_LoG)
im_edges_list.append(zero_crossing_filter(im_LoG, 3))
# pandas doesn't like setting a column to a list of arrays, so I first make the list into a pandas Series:
df.loc[df.channel=='r','LoG'] = pd.Series(im_LoG_list, index=df[df.channel=='r'].index)
# Skeletonize edges
im_edges_list = [skimage.morphology.skeletonize(im) for im in im_edges_list]
df.loc[df.channel=='r','edges'] = im_edges_list
# Fill holes
df.loc[df.channel=='r','segmented'] = [scipy.ndimage.morphology.binary_fill_holes(im) for im in df.loc[df.channel=='r','edges']]
# Remove small objects that are not bacteria
df.loc[df.channel=='r','segmented'] = [skimage.morphology.remove_small_objects(im, min_size=100) for im in df.loc[df.channel=='r','segmented']]
# Clear border with large buffer size b/c LoG procedure came off border
df.loc[df.channel=='r','segmented'] = [skimage.segmentation.clear_border(im, buffer_size=5) for im in df.loc[df.channel=='r','segmented']]
# df.loc[df.channel=='r','edges'] = [scipy.ndimage.filters.gaussian_laplace(df.loc[(df.channel=='r'),'image'].values[i], 2.0) for i, _ in enumerate(df.loc[(df.channel=='r'),'image'])]
plt.imshow(df['segmented'][2], cmap=plt.cm.viridis)
plt.imshow(skimage.measure.label(df['segmented'][2], background=0))
# Label binary image; background kwarg says value in im_bw to consider backgr.
labeled_list = [skimage.measure.label(im, background=0) for im in df.loc[df.channel=='r','segmented']]
# pandas doesn't like setting a column to a list of arrays, so I first make the list into a pandas Series:
df.loc[df.channel=='r','labeled'] = pd.Series(labeled_list, index=df[df.channel=='r'].index)
# Look at all the strain 001 images
with sns.axes_style('white'):
fig, ax = plt.subplots(len(df.index[(df.channel=='c') & (df.strain=='001')]), 3, figsize=(12,80))
ax[0,0].set_title('Original image')
ax[0,1].set_title('Edges')
ax[0,2].set_title('Labeled')
for i, _ in enumerate(df.index[(df.channel=='r') & (df.strain=='001')]):
ax[i,0].imshow(df.loc[df.channel=='r','image'].values[i][subim], cmap=plt.cm.viridis)
ax[i,1].imshow(df.loc[df.channel=='r','edges'].values[i][subim], cmap=plt.cm.viridis)
ax[i,2].imshow(df.loc[df.channel=='r','labeled'].values[i][subim], cmap=plt.cm.viridis)
ax[i,0].set_ylabel(df.loc[df.channel=='r','filepath'].values[i][-13:-4])
# Get regionprops for all images,
# using labeled segmentation of corresponding red channel
regionprops_df = pd.DataFrame()
for i in df.index:
regionprops = skimage.measure.regionprops\
(df.loc[(df.strain==df.ix[i]['strain'])
& (df.field_of_view==df.ix[i]['field_of_view'])
& (df.channel=='r'),'labeled'].values[0],
intensity_image=df.ix[i]['image'])
regionprops_df_temp = regionprops_to_df.regionprops_to_df(regionprops)
regionprops_df_temp['strain'] = df.ix[i]['strain']
regionprops_df_temp['field_of_view'] = df.ix[i]['field_of_view']
regionprops_df_temp['channel'] = df.ix[i]['channel']
regionprops_df = regionprops_df.append(regionprops_df_temp, ignore_index=True)
regionprops_df
# For my representative segmentations,
# instead of converting the image to RGB
# and colouring one of the channels with the segment,
# I will overlay the segment on top of the phase image
# with the alpha of the background of the segment image
# set to zero. This allows me to keep the colour information
# of the labels.
overlay_cmap = plt.cm.rainbow
overlay_cmap._init()
overlay_cmap._lut[0,-1] = 0
with sns.axes_style('white'):
fig, ax = plt.subplots(3,1, figsize=(20,20))
plt.suptitle('Representative segmentations:\n\
The phase image of the first frame of each strain,\n\
with labeled segmentations overlaid on top', size=20)
for i in range(3):
strain = '00' + str(i+1)
ax[i].set_title(df.loc[(df.channel=='p') & (df.strain==strain),'filepath'].values[0])
ax[i].imshow(df.loc[(df.channel=='p') & (df.strain==strain),'image'].values[0])
ax[i].imshow(df.loc[(df.channel=='r') & (df.strain==strain),'labeled'].values[0], cmap=overlay_cmap)
b) As discussed in class, we are not entirely sure of how to analyze the correlation or lack thereof of the two different σ factors. Generate plots that you think may be useful and/or develop metrics for correlation among the samples. This question is very open-ended and is your chance to experience thinking about real results with the members of your group.
for i in range(3):
strain = ['001','002','003'][i]
cval = regionprops_df.loc[(regionprops_df.strain==strain) & (regionprops_df.channel=='c'),'mean_intensity']
yval = regionprops_df.loc[(regionprops_df.strain==strain) & (regionprops_df.channel=='y'),'mean_intensity']
color = 'rgb'[i]
_ = plt.scatter([cval],[yval], color=color, alpha=0.2)
# Pearson correlation
r, _ = scipy.stats.pearsonr(cval,yval)
print('Pearson r for strain', strain,'=', r)
_ = plt.legend(['Strain ' + str(i+1) for i in range(3)])
_ = plt.xlabel('CFP intensity')
_ = plt.ylabel('YFP intensity')
It seems clear that YFP and CFP intensities are correlated with each other far more in strains 2 and 3 than in strain 1.
But is this small or large? Let's take a cue from Costes' method in the colocalisation tutorial, and randomise pairs of mean intensities.
# Shuffle only within each field of view, as exposure differences between fields of view would artificially lower the correlation.
for shuffle_number in range(100):
shuffle_number_column_heading = 'shuffle_{:03d}'.format(shuffle_number)
for strain in regionprops_df.strain.unique():
for field_of_view in regionprops_df.loc[(regionprops_df.strain==strain),'field_of_view'].unique():
listtoshuffle = regionprops_df.loc[(regionprops_df.strain==strain) & (regionprops_df.field_of_view==field_of_view) & (regionprops_df.channel=='c'),'mean_intensity'].values
np.random.shuffle(listtoshuffle)
regionprops_df.loc[(regionprops_df.strain==strain) & (regionprops_df.field_of_view==field_of_view) & (regionprops_df.channel=='c'),shuffle_number_column_heading] = listtoshuffle
#regionprops_df.loc[(regionprops_df.strain=='002') & (regionprops_df.channel=='c')]
df_pearson = pd.DataFrame()
for shuffle_number in range(2):
shuffle_number_column_heading = 'shuffle_{:03d}'.format(shuffle_number)
for strain in regionprops_df.strain.unique():
cval = regionprops_df.loc[(regionprops_df.strain==strain) & (regionprops_df.channel=='c'),shuffle_number_column_heading]
yval = regionprops_df.loc[(regionprops_df.strain==strain) & (regionprops_df.channel=='y'),'mean_intensity']
# Pearson correlation
r, _ = scipy.stats.pearsonr(cval,yval)
df_pearson_temp = pd.DataFrame(columns=['strain', 'r'])
df_pearson_temp.strain = pd.Series(strain)
df_pearson_temp.r = pd.Series(r)
df_pearson = df_pearson.append(df_pearson_temp)
for strain in regionprops_df.strain.unique():
print('The Mean Pearson r coefficient for randomised pairings of strain', strain, 'is', np.mean(df_pearson.loc[df_pearson.strain==strain,'r']), 'with s.d.', np.std(df_pearson.loc[df_pearson.strain==strain,'r']))
What does this mean? Recall from tutorial 9a:
There are four different strains. We have
The central question is: do the $\sigma$ factors "share" the available RNA polymerases at the same time, or does one $\sigma$ factor get all of the polymerases while the other does not. We have found that the intensity of CFP and YFP in each cell in strain 1 appears to be uncorrelated (compare to randomised pairs, and contrast with strain 2 and 3), which suggests that whether or not $\sigma_B$ is using RNA polymerase in a cell is independent of whether $\sigma_W$ is.